import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
# Load the comma-separated housing data into a 2-D float array.
dataset_path = "../dataset/houses.txt"
data = np.loadtxt(dataset_path, delimiter=",")
print(f"data.shape: {data.shape}")

# The first four columns are the input features; the last column is the target.
x_train = data[:, :4]
y_train = data[:, -1]
print("x_train.shape: {}, y_train.shape: {}".format(x_train.shape, y_train.shape))
data.shape: (100, 5) x_train.shape: (100, 4), y_train.shape: (100,)
# Scatter every raw feature against the price, one panel per feature.
x_features = ['size(sqft)','bedrooms','floors','age']
# sharey=True makes all panels use the same y-axis range and tick labels.
fig, ax = plt.subplots(1, 4, figsize=(18, 4), sharey=True)
for i, feature_name in enumerate(x_features):
    ax[i].scatter(x_train[:, i], y_train)
    ax[i].set_xlabel(feature_name)
# The y-axis is shared, so labelling the left-most panel is enough.
ax[0].set_ylabel("Price (1000's)")
plt.show()
After z-score normalization, all features will have a mean of 0 and a standard deviation of 1.
To implement z-score normalization, adjust your input values as shown in this formula: $$x^{(i)}_j = \dfrac{x^{(i)}_j - \mu_j}{\sigma_j} \tag{4}$$ where $j$ selects a feature or a column in the $\mathbf{X}$ matrix. $\mu_j$ is the mean of all the values for feature (j) and $\sigma_j$ is the standard deviation of feature (j). $$ \begin{align} \mu_j &= \frac{1}{m} \sum_{i=0}^{m-1} x^{(i)}_j \tag{5}\\ \sigma^2_j &= \frac{1}{m} \sum_{i=0}^{m-1} (x^{(i)}_j - \mu_j)^2 \tag{6} \end{align} $$
Implementation Note:
- When normalizing the features, it is important to store the values used for normalization - the mean value and the standard deviation used for the computations.
- After learning the parameters from the model, we often want to predict the prices of houses we have not seen before. Given a new x value (living room area and number of bedrooms), we must first normalize x using the mean and standard deviation that we had previously computed from the training set.
Implementation
# Per-feature (column-wise) mean and standard deviation of the training inputs.
x_mean = x_train.mean(axis=0)
x_std = x_train.std(axis=0)
print("x_mean:{}, \nx_std:{}".format(x_mean, x_std))
x_mean:[1.41371e+03 2.71000e+00 1.38000e+00 3.86500e+01], x_std:[412.17283499 0.65261014 0.48538644 25.78502472]
def get_zscore_normalized_features(x):
    """
    Compute the z-score normalization of x, column by column.

    Every column is shifted by its mean and divided by its standard
    deviation (np.std with its default ddof=0, i.e. the 1/m formula).
    A column whose standard deviation is zero (all values identical) would
    otherwise be divided by zero and become NaN; such a column is divided
    by 1 instead, so it comes out as all zeros.

    Args:
      x (ndarray (m,n))     : input data, m examples, n features
    Returns:
      x_norm (ndarray (m,n)): input normalized by column
      mu     (ndarray (n,)) : mean of each feature
      sigma  (ndarray (n,)) : standard deviation of each feature; it is 0
                              for a constant feature, so guard against that
                              before dividing new data by sigma
    """
    x_mean = np.mean(x, axis=0)
    x_std = np.std(x, axis=0)
    # Only the divisor is patched; the returned sigma stays the true value.
    safe_std = np.where(x_std == 0, 1.0, x_std)
    x_norm = (x - x_mean) / safe_std
    return x_norm, x_mean, x_std
# Normalize the training features and keep the statistics used to do so.
x_norm, x_mean, x_std = get_zscore_normalized_features(x_train)

# Compare the per-column value spread (max - min) before and after scaling.
ptp_raw = np.ptp(x_train, axis=0)
ptp_norm = np.ptp(x_norm, axis=0)
print(f"Peak to Peak range by column in Raw X:{ptp_raw}")
print(f"Peak to Peak range by column in Normalized X:{ptp_norm}")
Peak to Peak range by column in Raw X:[2.406e+03 4.000e+00 1.000e+00 9.500e+01] Peak to Peak range by column in Normalized X:[5.83735704 6.12923357 2.06021411 3.68430905]
# Scatter every normalized feature against the price, one panel per feature.
x_features = ['size(sqft)','bedrooms','floors','age']
# sharey=True makes all panels use the same y-axis range and tick labels.
fig, ax = plt.subplots(1, 4, figsize=(18, 4), sharey=True)
for i, feature_name in enumerate(x_features):
    ax[i].scatter(x_norm[:, i], y_train)
    ax[i].set_xlabel(feature_name)
# The y-axis is shared, so labelling the left-most panel is enough.
ax[0].set_ylabel("Price (1000's)")
plt.show()
def norm_plot(ax, data):
    """
    Draw a histogram of data on ax and overlay a fitted normal curve.

    The histogram uses 50 evenly spaced bin edges covering the data range
    widened by 20% of that range on each side.  The normal pdf, evaluated
    at those bin edges with the mean and standard deviation of data, is
    drawn on a twin y-axis whose own axis decorations are switched off.

    Args:
      ax   : axes to draw on (needs hist and twinx)
      data : values whose distribution is plotted
    """
    lowest, highest = np.min(data), np.max(data)
    margin = (highest - lowest) * 0.2
    edges = np.linspace(lowest - margin, highest + margin, 50)
    _, bins, _ = ax.hist(data, edges, color="xkcd:azure")

    # Normal density matching the sample's mean and spread.
    density = norm.pdf(bins, loc=np.mean(data), scale=np.std(data))

    # Separate y-scale so the pdf is not flattened by the histogram counts.
    overlay = ax.twinx()
    overlay.plot(bins, density, color="orangered", lw=2)
    overlay.set_ylim(bottom=0)
    overlay.axis('off')
# Histogram plus fitted normal curve for each raw feature.
fig, ax = plt.subplots(1, 4, figsize=(12, 3))
for i, panel in enumerate(ax):
    norm_plot(panel, x_train[:, i])
    panel.set_xlabel(x_features[i])
ax[0].set_ylabel("count")
fig.suptitle("distribution of features before normalization")
plt.show()
# Histogram plus fitted normal curve for each z-score normalized feature.
fig, ax = plt.subplots(1, 4, figsize=(12, 3))
for i, panel in enumerate(ax):
    norm_plot(panel, x_norm[:, i])
    panel.set_xlabel(x_features[i])
ax[0].set_ylabel("count")
fig.suptitle("distribution of features after normalization")
plt.show()